Leo Lu
2018-03-31
呂奕 Leo Lu
mpg dataset: Fuel economy data from 1999 and 2008 for 38 popular models of car.| variable | detail |
|---|---|
| manufacturer | 車廠 |
| model | 型號 |
| displ | 引擎排氣量 |
| year | 出廠年份 |
| cyl | 氣缸數 |
| trans | 自/手排 |
| drv | f = front-wheel drive, r = rear wheel drive, 4 = 4wd |
| cty | city miles per gallon 城市駕駛油耗 |
| hwy | highway miles per gallon 高速公路駕駛油耗 |
| fl | 汽油: ethanol E85, diesel, regular, premium, CNG |
| class | 車型 |
ggplot(data = <DATA>) + # Data
geom_<xxx>(
mapping = aes(<MAPPINGS>), ## <= Aesthetic mappings
stat = <STAT>,
position = <POSITION>
) +
scale_<xxx>() + coord_<xxx>() + facet_<xxx>()
theme_()ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, color = class), size = 3)ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, shape = class))ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
geom_smooth(mapping= aes(x = displ, y = hwy))ggplot(data = mpg) +
geom_bar(mapping = aes(x = class)) +
facet_wrap( ~ manufacturer, ncol = 3)library(gapminder)
gapminder#> # A tibble: 1,704 x 6
#> country continent year lifeExp pop gdpPercap
#> <fct> <fct> <int> <dbl> <int> <dbl>
#> 1 Afghanistan Asia 1952 28.8 8425333 779.
#> 2 Afghanistan Asia 1957 30.3 9240934 821.
#> 3 Afghanistan Asia 1962 32.0 10267083 853.
#> 4 Afghanistan Asia 1967 34.0 11537966 836.
#> 5 Afghanistan Asia 1972 36.1 13079460 740.
#> 6 Afghanistan Asia 1977 38.4 14880372 786.
#> 7 Afghanistan Asia 1982 39.9 12881816 978.
#> 8 Afghanistan Asia 1987 40.8 13867957 852.
#> 9 Afghanistan Asia 1992 41.7 16317921 649.
#> 10 Afghanistan Asia 1997 41.8 22227415 635.
#> # ... with 1,694 more rows
tidyr)purrr)broom)library(dplyr)
library(tidyr)
gapminder <- gapminder %>% mutate(year1950 = year - 1950)
by_country <- gapminder %>%
group_by(continent, country) %>%
nest
by_country#> # A tibble: 142 x 3
#> continent country data
#> <fct> <fct> <list>
#> 1 Asia Afghanistan <tibble [12 × 5]>
#> 2 Europe Albania <tibble [12 × 5]>
#> 3 Africa Algeria <tibble [12 × 5]>
#> 4 Africa Angola <tibble [12 × 5]>
#> 5 Americas Argentina <tibble [12 × 5]>
#> 6 Oceania Australia <tibble [12 × 5]>
#> 7 Europe Austria <tibble [12 × 5]>
#> 8 Asia Bahrain <tibble [12 × 5]>
#> 9 Asia Bangladesh <tibble [12 × 5]>
#> 10 Europe Belgium <tibble [12 × 5]>
#> # ... with 132 more rows
lm(lifeExp ~ year, data = Afghanistan)
lm(lifeExp ~ year, data = Afghanistan)
...library(purrr)
country_model <- function(df) {
lm(lifeExp ~ year1950, data = df)
}
models <- by_country %>%
mutate(mod_lm = map(data, country_model))
models#> # A tibble: 142 x 4
#> continent country data mod_lm
#> <fct> <fct> <list> <list>
#> 1 Asia Afghanistan <tibble [12 × 5]> <S3: lm>
#> 2 Europe Albania <tibble [12 × 5]> <S3: lm>
#> 3 Africa Algeria <tibble [12 × 5]> <S3: lm>
#> 4 Africa Angola <tibble [12 × 5]> <S3: lm>
#> 5 Americas Argentina <tibble [12 × 5]> <S3: lm>
#> 6 Oceania Australia <tibble [12 × 5]> <S3: lm>
#> 7 Europe Austria <tibble [12 × 5]> <S3: lm>
#> 8 Asia Bahrain <tibble [12 × 5]> <S3: lm>
#> 9 Asia Bangladesh <tibble [12 × 5]> <S3: lm>
#> 10 Europe Belgium <tibble [12 × 5]> <S3: lm>
#> # ... with 132 more rows
models <- models %>%
mutate(
glance = mod_lm %>% map(broom::glance),
tidy = mod_lm %>% map(broom::tidy),
augment = mod_lm %>% map(broom::augment),
rsq = glance %>% map_dbl("r.squared")
)
models#> # A tibble: 142 x 8
#> continent country data mod_lm glance tidy augment rsq
#> <fct> <fct> <list> <list> <list> <list> <list> <dbl>
#> 1 Asia Afghanistan <tibble… <S3: l… <data.f… <data.… <data.fr… 0.948
#> 2 Europe Albania <tibble… <S3: l… <data.f… <data.… <data.fr… 0.911
#> 3 Africa Algeria <tibble… <S3: l… <data.f… <data.… <data.fr… 0.985
#> 4 Africa Angola <tibble… <S3: l… <data.f… <data.… <data.fr… 0.888
#> 5 Americas Argentina <tibble… <S3: l… <data.f… <data.… <data.fr… 0.996
#> 6 Oceania Australia <tibble… <S3: l… <data.f… <data.… <data.fr… 0.980
#> 7 Europe Austria <tibble… <S3: l… <data.f… <data.… <data.fr… 0.992
#> 8 Asia Bahrain <tibble… <S3: l… <data.f… <data.… <data.fr… 0.967
#> 9 Asia Bangladesh <tibble… <S3: l… <data.f… <data.… <data.fr… 0.989
#> 10 Europe Belgium <tibble… <S3: l… <data.f… <data.… <data.fr… 0.995
#> # ... with 132 more rows
models %>%
ggplot(aes(rsq, reorder(country, rsq))) + # use factor levels
geom_point(aes(colour = continent)) +
theme(axis.text=element_text(size=rel(0.7)))models %>% unnest(data)#> # A tibble: 1,704 x 8
#> continent country rsq year lifeExp pop gdpPercap year1950
#> <fct> <fct> <dbl> <int> <dbl> <int> <dbl> <dbl>
#> 1 Asia Afghanistan 0.948 1952 28.8 8425333 779. 2.
#> 2 Asia Afghanistan 0.948 1957 30.3 9240934 821. 7.
#> 3 Asia Afghanistan 0.948 1962 32.0 10267083 853. 12.
#> 4 Asia Afghanistan 0.948 1967 34.0 11537966 836. 17.
#> 5 Asia Afghanistan 0.948 1972 36.1 13079460 740. 22.
#> 6 Asia Afghanistan 0.948 1977 38.4 14880372 786. 27.
#> 7 Asia Afghanistan 0.948 1982 39.9 12881816 978. 32.
#> 8 Asia Afghanistan 0.948 1987 40.8 13867957 852. 37.
#> 9 Asia Afghanistan 0.948 1992 41.7 16317921 649. 42.
#> 10 Asia Afghanistan 0.948 1997 41.8 22227415 635. 47.
#> # ... with 1,694 more rows
models %>% unnest(glance, .drop = TRUE)#> # A tibble: 142 x 14
#> continent country rsq r.squared adj.r.squared sigma statistic
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Asia Afghanistan 0.948 0.948 0.942 1.22 181.
#> 2 Europe Albania 0.911 0.911 0.902 1.98 102.
#> 3 Africa Algeria 0.985 0.985 0.984 1.32 662.
#> 4 Africa Angola 0.888 0.888 0.877 1.41 79.1
#> 5 Americas Argentina 0.996 0.996 0.995 0.292 2246.
#> 6 Oceania Australia 0.980 0.980 0.978 0.621 481.
#> 7 Europe Austria 0.992 0.992 0.991 0.407 1261.
#> 8 Asia Bahrain 0.967 0.967 0.963 1.64 291.
#> 9 Asia Bangladesh 0.989 0.989 0.988 0.977 930.
#> 10 Europe Belgium 0.995 0.995 0.994 0.293 1822.
#> # ... with 132 more rows, and 7 more variables: p.value <dbl>, df <int>,
#> # logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>
models %>% unnest(tidy, .drop = TRUE)#> # A tibble: 284 x 8
#> continent country rsq term estimate std.error statistic p.value
#> <fct> <fct> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Asia Afghanistan 0.948 (Int… 29.4 0.699 42.0 1.40e-12
#> 2 Asia Afghanistan 0.948 year… 0.275 0.0205 13.5 9.84e- 8
#> 3 Europe Albania 0.911 (Int… 58.6 1.13 51.7 1.79e-13
#> 4 Europe Albania 0.911 year… 0.335 0.0332 10.1 1.46e- 6
#> 5 Africa Algeria 0.985 (Int… 42.2 0.756 55.8 8.22e-14
#> 6 Africa Algeria 0.985 year… 0.569 0.0221 25.7 1.81e-10
#> 7 Africa Angola 0.888 (Int… 31.7 0.804 39.4 2.63e-12
#> 8 Africa Angola 0.888 year… 0.209 0.0235 8.90 4.59e- 6
#> 9 Americas Argentina 0.996 (Int… 62.2 0.167 372. 4.80e-22
#> 10 Americas Argentina 0.996 year… 0.232 0.00489 47.4 4.22e-13
#> # ... with 274 more rows
models %>%
unnest(tidy) %>%
select(continent, country, term, estimate, rsq) %>%
spread(key = term, value = estimate) %>%
ggplot(aes(`(Intercept)`, year1950)) +
geom_point(aes(colour = continent, size = rsq)) +
geom_smooth(se = FALSE)models %>%
unnest(augment) %>%
ggplot(aes(year1950, .resid)) +
geom_line(aes(group = country), alpha = 0.3) +
geom_smooth(se = FALSE) +
geom_hline(yintercept = 0, colour = "red", alpha = 0.7) +
facet_wrap(~continent)Functional programming is very powerful
tidyr: 把物件 (例如 lm) 用 list 存在 columns 裡面purrr: Functional programmingbroom: Models → tidy data